# Knitr setup: hide code chunks in the rendered document.
knitr::opts_chunk$set(echo = FALSE)

# Every package this analysis relies on.
needed_packages <- c(
  "VIM", "tidyverse", "pastecs", "ggplot2", "semTools", "psych", "FSA", "car",
  "effectsize", "coin", "rstatix", "sjstats", "userfriendlyscience", "stats",
  "foreign", "gmodels", "lm.beta", "stargazer", "lmtest", "DescTools", "nnet",
  "reshape2", "generalhoslem", "Epi", "arm", "regclass", "olsrr", "REdaS",
  "Hmisc", "corrplot", "ggcorrplot", "factoextra", "nFactors", "readxl"
)

# Extract the packages that are not yet installed.
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[, "Package"])]
# Install any missing packages. Explicit `> 0` check instead of relying on
# integer truthiness in `if`, and braces around the multi-line body.
if (length(not_installed) > 0) {
  install.packages(not_installed)
}

library(pastecs) #For creating descriptive statistic summaries
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
## 
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(psych)  #For descriptive functions
## 
## Attaching package: 'psych'
## The following object is masked from 'package:semTools':
## 
##     skew
## The following object is masked from 'package:lavaan':
## 
##     cor2cov
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(FSA) #For percentage
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
## 
## Attaching package: 'FSA'
## The following object is masked from 'package:psych':
## 
##     headtail
library(car) # For Levene's test for homogeneity of variance and  test for colinearity of predictors
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
## 
##     bootCase
## The following object is masked from 'package:psych':
## 
##     logit
library(effectsize) #To calculate effect size for t-test
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x readr::clipboard() masks semTools::clipboard()
## x tidyr::extract()   masks pastecs::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks pastecs::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks pastecs::last()
## x dplyr::recode()    masks car::recode()
## x purrr::some()      masks car::some()
library(coin) # For Wilcox test (non-parametric)
## Loading required package: survival
library(rstatix) # For calculating effect size
## 
## Attaching package: 'rstatix'
## The following objects are masked from 'package:coin':
## 
##     chisq_test, friedman_test, kruskal_test, sign_test, wilcox_test
## The following objects are masked from 'package:effectsize':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:stats':
## 
##     filter
library(sjstats) #calculate effect size for t-test
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## Attaching package: 'sjstats'
## The following objects are masked from 'package:effectsize':
## 
##     cohens_f, phi
## The following object is masked from 'package:FSA':
## 
##     se
## The following object is masked from 'package:psych':
## 
##     phi
library(userfriendlyscience)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Registered S3 methods overwritten by 'ufs':
##   method                     from               
##   grid.draw.ggProportionPlot userfriendlyscience
##   pander.associationMatrix   userfriendlyscience
##   pander.dataShape           userfriendlyscience
##   pander.descr               userfriendlyscience
##   pander.normalityAssessment userfriendlyscience
##   print.CramersV             userfriendlyscience
##   print.associationMatrix    userfriendlyscience
##   print.confIntOmegaSq       userfriendlyscience
##   print.confIntV             userfriendlyscience
##   print.dataShape            userfriendlyscience
##   print.descr                userfriendlyscience
##   print.ggProportionPlot     userfriendlyscience
##   print.meanConfInt          userfriendlyscience
##   print.multiVarFreq         userfriendlyscience
##   print.normalityAssessment  userfriendlyscience
##   print.regrInfluential      userfriendlyscience
##   print.scaleDiagnosis       userfriendlyscience
##   print.scaleStructure       userfriendlyscience
##   print.scatterMatrix        userfriendlyscience
## 
## Attaching package: 'userfriendlyscience'
## The following objects are masked from 'package:FSA':
## 
##     is.even, is.odd
## The following object is masked from 'package:semTools':
## 
##     reliability
library(stats)
library(foreign) # open SPSS file, I may not use that.
library(gmodels) #For creating histograms with more detail than plot
## 
## Attaching package: 'gmodels'
## The following object is masked from 'package:sjstats':
## 
##     ci
library(stargazer)#For formatting outputs/tables for regression
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(lm.beta) # to isolate the beta co-efficients for regression

#Multinomial regression
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(DescTools)
## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
library(nnet) #Multinomial regression
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(generalhoslem) #For test of fit for logistic regression, test assumption of linearity
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## The following objects are masked from 'package:reshape2':
## 
##     colsplit, melt, recast
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:rstatix':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(Epi) #ROC Curve
library(arm) #for invlogit calculating predicted probabilities
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:reshape':
## 
##     expand
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: lme4
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:Epi':
## 
##     factorize
## The following object is masked from 'package:userfriendlyscience':
## 
##     getData
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is /home/d19125334/Documents/PhD_Modules_2021/ProbilityandStatisiticInference/assignment/assignment2/PSI_Assginment2
## 
## Attaching package: 'arm'
## The following object is masked from 'package:effectsize':
## 
##     standardize
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:psych':
## 
##     logit, rescale, sim
library(regclass) #For confusion matrix
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:arm':
## 
##     logit
## The following object is masked from 'package:lmtest':
## 
##     lrtest
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:VIM':
## 
##     wine
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:psych':
## 
##     fisherz, logistic, logit
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
## 
## Attaching package: 'regclass'
## The following object is masked from 'package:DescTools':
## 
##     VIF
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(dplyr)
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:sjstats':
## 
##     bootstrap
library(ggpubr)

#Dimension Reduction
library(REdaS)
library(Hmisc)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:regclass':
## 
##     qq
## The following object is masked from 'package:userfriendlyscience':
## 
##     oneway
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:DescTools':
## 
##     %nin%, Label, Mean, Quantile
## The following object is masked from 'package:userfriendlyscience':
## 
##     escapeRegex
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(corrplot)
## corrplot 0.84 loaded
## 
## Attaching package: 'corrplot'
## The following object is masked from 'package:arm':
## 
##     corrplot
library(ggcorrplot)
## 
## Attaching package: 'ggcorrplot'
## The following object is masked from 'package:rstatix':
## 
##     cor_pmat
library(factoextra) #Used for principal component analysis to get a different view of eigenvalues
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(nFactors)
## 
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
## 
##     parallel
library("readxl")
# Load the academic-performance dataset from the Excel workbook, sheet
# "SABER11_SABERPRO". The file is assumed to be in the working directory;
# adjust the path if knitting from elsewhere.
data_acperfor <- read_excel("data_academic_performance.xlsx", sheet = "SABER11_SABERPRO")
# Normalise all column names to lower case so access below is consistent.
colnames(data_acperfor) <- tolower(colnames(data_acperfor))

# First rows of the tibble (44 columns; output echoed below).
head(data_acperfor)
## # A tibble: 6 x 44
##   cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… F      Incomplet… Complete … Technical… Home       Stratu… It is…
## 2 SB1120… F      Complete … Complete … Entrepren… Independe… Stratu… It is…
## 3 SB1120… M      Not sure   Not sure   Independe… Home       Stratu… Level…
## 4 SB1120… F      Not sure   Not sure   Other occ… Independe… Stratu… It is…
## 5 SB1120… M      Complete … Complete … Executive  Home       Stratu… It is…
## 6 SB1120… F      Complete … Complete … Independe… Executive  Stratu… It is…
## # … with 36 more variables: people_house <chr>, internet <chr>, tv <chr>,
## #   computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## #   fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## #   school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## #   cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## #   university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## #   cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## #   percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## #   sel_ihe <dbl>
# Overview of every column: 12,411 rows, mostly character (categorical)
# columns plus the numeric test scores used later (mat_s11, fep_pro, g_sc, …).
summary(data_acperfor)
##    cod_s11             gender           edu_father         edu_mother       
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   occ_father         occ_mother          stratum             sisben         
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  people_house         internet              tv              computer        
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  washing_mch          mic_oven             car                dvd           
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     fresh              phone              mobile            revenue         
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      job            school_name         school_nat        school_type       
##  Length:12411       Length:12411       Length:12411       Length:12411      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     mat_s11           cr_s11           cc_s11          bio_s11      
##  Min.   : 26.00   Min.   : 24.00   Min.   :  0.00   Min.   : 11.00  
##  1st Qu.: 56.00   1st Qu.: 54.00   1st Qu.: 54.00   1st Qu.: 56.00  
##  Median : 64.00   Median : 61.00   Median : 60.00   Median : 64.00  
##  Mean   : 64.32   Mean   : 60.78   Mean   : 60.71   Mean   : 63.95  
##  3rd Qu.: 72.00   3rd Qu.: 67.00   3rd Qu.: 67.00   3rd Qu.: 71.00  
##  Max.   :100.00   Max.   :100.00   Max.   :100.00   Max.   :100.00  
##     eng_s11        cod_spro          university        academic_program  
##  Min.   : 26.0   Length:12411       Length:12411       Length:12411      
##  1st Qu.: 50.0   Class :character   Class :character   Class :character  
##  Median : 59.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 61.8                                                           
##  3rd Qu.: 72.0                                                           
##  Max.   :100.0                                                           
##      qr_pro           cr_pro          cc_pro          eng_pro     
##  Min.   :  1.00   Min.   :  1.0   Min.   :  1.00   Min.   :  1.0  
##  1st Qu.: 65.00   1st Qu.: 42.0   1st Qu.: 36.00   1st Qu.: 51.0  
##  Median : 85.00   Median : 67.0   Median : 65.00   Median : 74.0  
##  Mean   : 77.42   Mean   : 62.2   Mean   : 59.19   Mean   : 67.5  
##  3rd Qu.: 96.00   3rd Qu.: 86.0   3rd Qu.: 85.00   3rd Qu.: 88.0  
##  Max.   :100.00   Max.   :100.0   Max.   :100.00   Max.   :100.0  
##      wc_pro         fep_pro           g_sc         percentile    
##  Min.   :  0.0   Min.   :  1.0   Min.   : 37.0   Min.   :  1.00  
##  1st Qu.: 28.0   1st Qu.:124.0   1st Qu.:147.0   1st Qu.: 51.00  
##  Median : 56.0   Median :153.0   Median :163.0   Median : 75.00  
##  Mean   : 53.7   Mean   :145.5   Mean   :162.7   Mean   : 68.45  
##  3rd Qu.: 80.0   3rd Qu.:174.0   3rd Qu.:179.0   3rd Qu.: 90.00  
##  Max.   :100.0   Max.   :300.0   Max.   :247.0   Max.   :100.00  
##    2nd_decile       quartile          sel           sel_ihe     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :4.000   Median :4.000   Median :2.000   Median :2.000  
##  Mean   :3.886   Mean   :3.189   Mean   :2.599   Mean   :2.409  
##  3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :4.000   Max.   :4.000   Max.   :4.000

Set up a quantitative research question

What is the relationship, for male and female engineering students in Colombia, between the overall average score of the professional evaluation (g_sc) and the scores for Formulation of Engineering Projects (fep_pro) and Mathematics (mat_s11)?

Data Exploration

Dependent variable: g_sc: the overall average score of the professional evaluation
Independent variables: 1. fep_pro: Formulation of Engineering Projects 2. mat_s11 (Mathematics: the only subject common to the two test periods; check whether it impacts the global score g_sc.) 3. gender (binary: F/M) — first check whether there are gender differences

*** Add an interaction term to explore further: fep_pro*mat_s11 as the interaction variable for model 3

# Descriptive statistics
# g_sc summary statistics (dependent variable: overall average score of the
# professional evaluation)
pastecs::stat.desc(data_acperfor$g_sc, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##  163.0000000  162.7104988    0.2074642    0.4066620  534.1866899   23.1124791 
##     coef.var 
##    0.1420466
summary(data_acperfor$g_sc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    37.0   147.0   163.0   162.7   179.0   247.0
# Analyze Normality --- Dependent variable: g_sc
# Histogram of g_sc as a density, with the matching theoretical normal curve
# (red) overlaid for visual comparison.
gg_gsc <- ggplot(data_acperfor, aes(x=g_sc)) +
  labs(x="g_sc") +
  ggtitle("Figure 1 - Histogram for Normalised g_sc") +
  geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
  scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") + 
  stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$g_sc, na.rm=TRUE), sd=sd(data_acperfor$g_sc, na.rm=TRUE)))

gg_gsc

# QQ plot against a theoretical normal distribution.
qqnorm(data_acperfor$g_sc, main="Figure 2 - QQ Plot for Normalised g_sc")
qqline(data_acperfor$g_sc, col=2)

# Skewness and kurtosis: semTools returns the statistic with its standard
# error, so statistic/SE below gives a standardised (z-like) value.
tpskew <- semTools::skew(data_acperfor$g_sc)
tpkurt <- semTools::kurtosis(data_acperfor$g_sc)


tpskew[1]/tpskew[2]
## skew (g1) 
## -4.339492
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##       -1.714035
# Percentage of standardised |z| scores beyond 1.96 and 3.29
# (conventional 95% / 99.9% outlier-screening thresholds).
gsc<- abs(scale(data_acperfor$g_sc))
FSA::perc(as.numeric(gsc), 1.96, "gt")
## [1] 4.060914
FSA::perc(as.numeric(gsc), 3.29, "gt")
## [1] 0.15309
# Descriptive statistics
# mat_s11 summary statistics (independent variable: Mathematics score)
pastecs::stat.desc(data_acperfor$mat_s11, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   64.0000000   64.3207638    0.1065813    0.2089158  140.9835648   11.8736500 
##     coef.var 
##    0.1846006
summary(data_acperfor$mat_s11)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.00   56.00   64.00   64.32   72.00  100.00
# Analyze Normality  --- independent variable: mat_s11
# (Note: this reuses the gg_fegpro variable name from the fep_pro section.)
gg_fegpro <- ggplot(data_acperfor, aes(x=mat_s11)) +
  labs(x="mat_s11") +
  ggtitle("Figure 3 - Histogram for Normalised mat_s11") +
  geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
  scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") + 
  stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$mat_s11, na.rm=TRUE), sd=sd(data_acperfor$mat_s11, na.rm=TRUE)))

gg_fegpro

# QQ plot against a theoretical normal distribution.
qqnorm(data_acperfor$mat_s11, main="Figure 4 - QQ Plot for Normalised mat_s11")
qqline(data_acperfor$mat_s11, col=2)

# Skewness and kurtosis standardised by their standard errors (see the g_sc
# section above for interpretation).
tpskew <- semTools::skew(data_acperfor$mat_s11)
tpkurt <- semTools::kurtosis(data_acperfor$mat_s11)


tpskew[1]/tpskew[2]
## skew (g1) 
##  18.17215
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##        2.954064
# Percentage of standardised |z| scores beyond 1.96 and 3.29.
mats11<- abs(scale(data_acperfor$mat_s11))
FSA::perc(as.numeric(mats11), 1.96, "gt")
## [1] 4.447667
FSA::perc(as.numeric(mats11), 3.29, "gt")
## [1] 0
# Descriptive statistics
# fep_pro summary statistics (independent variable: Formulation of
# Engineering Projects score)
pastecs::stat.desc(data_acperfor$fep_pro, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##  153.0000000  145.4765933    0.3601859    0.7060202 1610.1268292   40.1263857 
##     coef.var 
##    0.2758271
summary(data_acperfor$fep_pro)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   124.0   153.0   145.5   174.0   300.0
# Analyze Normality  --- independent variable: fep_pro
gg_fegpro <- ggplot(data_acperfor, aes(x=fep_pro)) +
  labs(x="fep_pro") +
  ggtitle("Figure 5 - Histogram for Normalised fep_pro") +
  geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..)) +
  scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C") + 
  stat_function(fun=dnorm, color="red",args=list(mean=mean(data_acperfor$fep_pro, na.rm=TRUE), sd=sd(data_acperfor$fep_pro, na.rm=TRUE)))

gg_fegpro

# QQ plot against a theoretical normal distribution.
qqnorm(data_acperfor$fep_pro, main="Figure 6 - QQ Plot for Normalised fep_pro")
qqline(data_acperfor$fep_pro, col=2)

# Skewness and kurtosis standardised by their standard errors (see the g_sc
# section above for interpretation).
tpskew <- semTools::skew(data_acperfor$fep_pro)
tpkurt <- semTools::kurtosis(data_acperfor$fep_pro)


tpskew[1]/tpskew[2]
## skew (g1) 
##  -38.2767
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##        16.28851
# Percentage of standardised |z| scores beyond 1.96 and 3.29.
feppro<- abs(scale(data_acperfor$fep_pro))
FSA::perc(as.numeric(feppro), 1.96, "gt")
## [1] 4.632987
FSA::perc(as.numeric(feppro), 3.29, "gt")
## [1] 0.5076142
# Check whether the data are balanced with respect to gender.

summary(as.factor(data_acperfor$gender))
##    F    M 
## 5043 7368
# Proportion of each gender. Compute the frequency table once and reuse it
# (the original re-tabulated the column four times), and use `<-` assignment.
gender_counts <- table(data_acperfor$gender)
female_var <- gender_counts["F"] / (gender_counts["F"] + gender_counts["M"])
male_var <- gender_counts["M"] / (gender_counts["F"] + gender_counts["M"])
print(female_var) # ~0.41 of students are female
##         F 
## 0.4063331
print(male_var) # ~0.59 of students are male
##         M 
## 0.5936669
# Missing-data screening for the four analysis variables.
# 1. Visualise the level and pattern of missingness.
varsint <- c("fep_pro", "mat_s11", "g_sc", "gender")
acperfor_subset <- dplyr::select(data_acperfor, all_of(varsint))
summary(acperfor_subset)
##     fep_pro         mat_s11            g_sc          gender         
##  Min.   :  1.0   Min.   : 26.00   Min.   : 37.0   Length:12411      
##  1st Qu.:124.0   1st Qu.: 56.00   1st Qu.:147.0   Class :character  
##  Median :153.0   Median : 64.00   Median :163.0   Mode  :character  
##  Mean   :145.5   Mean   : 64.32   Mean   :162.7                     
##  3rd Qu.:174.0   3rd Qu.: 72.00   3rd Qu.:179.0                     
##  Max.   :300.0   Max.   :100.00   Max.   :247.0
# Create and inspect the patterns of missingness; VIM::aggr() also draws the
# missingness plot as a side effect.
res <- summary(VIM::aggr(acperfor_subset, sortVar = TRUE))$combinations

## 
##  Variables sorted by number of missings: 
##  Variable Count
##   fep_pro     0
##   mat_s11     0
##      g_sc     0
##    gender     0
### Result: no missing data in any of the four variables.
# Correlation between fep_pro and g_sc
# Show a scatterplot of g_sc (y) against fep_pro (x).
scat_fepsc <- ggplot2::ggplot(data_acperfor, aes(fep_pro, g_sc)) 

# Add a linear regression line (red) over the points.
scat_fepsc + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "fep_pro", y = "Normalised g_sc") 
## `geom_smooth()` using formula 'y ~ x'

# Pearson correlation between fep_pro and g_sc (result echoed below:
# r = 0.37, p < .001 — a weak positive correlation).
stats::cor.test(data_acperfor$fep_pro, data_acperfor$g_sc, method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  data_acperfor$fep_pro and data_acperfor$g_sc
## t = 44.872, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3584055 0.3886814
## sample estimates:
##      cor 
## 0.373643
####################################################################################################
# Correlation between mat_s11 and g_sc
# Show a scatterplot of g_sc (y) against mat_s11 (x).
# (Note: this reuses the scat_fepsc variable name from the previous section.)
scat_fepsc <- ggplot2::ggplot(data_acperfor, aes(mat_s11, g_sc))
# Add a linear regression line (red) over the points.
scat_fepsc + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "mat_s11", y = "Normalised g_sc") 
## `geom_smooth()` using formula 'y ~ x'

# Pearson correlation between mat_s11 and g_sc (result echoed below:
# r = 0.64, p < .001 — a moderate positive correlation).
stats::cor.test(data_acperfor$mat_s11, data_acperfor$g_sc, method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  data_acperfor$mat_s11 and data_acperfor$g_sc
## t = 93.733, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6334194 0.6540231
## sample estimates:
##       cor 
## 0.6438379

Notes: 1. The correlation between fep_pro and g_sc is a weak positive correlation (r = 0.37). 2. The correlation between mat_s11 and g_sc is a moderate positive correlation (r = 0.64).

# Checking the difference in g_sc by gender.
# Add gender to investigate a differential effect.
# Independent-samples t-test.
psych::describeBy(data_acperfor$g_sc, data_acperfor$gender, mat=TRUE)
##     item group1 vars    n     mean       sd median  trimmed     mad min max
## X11    1      F    1 5043 161.2782 22.33294    162 161.4994 23.7216  76 242
## X12    2      M    1 7368 163.6908 23.58262    164 163.9561 25.2042  37 247
##     range        skew    kurtosis        se
## X11   166 -0.07216008 -0.24717975 0.3144861
## X12   210 -0.12210492  0.01441897 0.2747370
# Levene's test for homogeneity of variance.
# NOTE(review): the test is significant (p = 0.0003), i.e. equal variances
# are doubtful, yet the t-test below pools variances (var.equal=TRUE).
# Consider Welch's t-test (var.equal=FALSE) — confirm against the report text.
car::leveneTest(g_sc ~ gender, data=data_acperfor)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value   Pr(>F)    
## group     1  13.115 0.000294 ***
##       12409                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pooled-variance (Student's) t-test of g_sc by gender.
res <- stats::t.test(g_sc ~ gender, var.equal=TRUE, data=data_acperfor)
res
## 
##  Two Sample t-test
## 
## data:  g_sc by gender
## t = -5.7189, df = 12409, p-value = 1.097e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.239544 -1.585691
## sample estimates:
## mean in group F mean in group M 
##        161.2782        163.6908
# Cohen's d from the t statistic.
# Guideline: 0.2 = small effect, 0.5 = moderate, 0.8 = large.
# NOTE(review): 2t/sqrt(df) is the equal-group-size approximation; the groups
# here are 5043 vs 7368, so treat this as approximate (t_to_d below agrees).
effsize_gender = round((2*res$statistic)/sqrt(res$parameter), 2)
effsize_gender
##    t 
## -0.1
effectsize::t_to_d(t=res$statistic, res$parameter)
##     d |         95% CI
## ----------------------
## -0.10 | [-0.14, -0.07]
# Eta squared: t^2 / (t^2 + df).
# Reporting guideline on effect size: 0.01 = small, 0.06 = moderate, 0.14 = large.
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
##     t 
## 0.003

Report of the t-test results:

1. Reporting the results with Cohen's d effect size
An independent-samples t-test was conducted to compare the overall average score of the professional evaluation (g_sc) for female and male students. A statistically significant difference in the overall average score of the professional evaluation was found (M=161.28, SD=22.33 for female students; M=163.69, SD=23.58 for male students; t(12409)=-5.72, p < 0.001). Cohen's d indicated a small effect size (-0.10).

2. Reporting the results with eta squared effect size
An independent-samples t-test was conducted to compare the overall average score of the professional evaluation (g_sc) for female and male students. A statistically significant difference in the overall average score of the professional evaluation was found (M=161.28, SD=22.33 for female students; M=163.69, SD=23.58 for male students; t(12409)=-5.72, p < 0.001). A small effect size was also indicated by the eta squared value (0.003).

Build linear regression models

# MLR_Model1
# Independent variables: fep_pro, mat_s11
# Dependent variable: g_sc
# NOTE(review): the formula uses `data_acperfor$...` terms instead of
# lm(g_sc ~ fep_pro + mat_s11, data = data_acperfor); this works, but it
# prevents predict() with a newdata argument later — confirm before reuse.
mmodel_1 <- lm(data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
anova(mmodel_1)
## Analysis of Variance Table
## 
## Response: data_acperfor$g_sc
##                          Df  Sum Sq Mean Sq F value    Pr(>F)    
## data_acperfor$fep_pro     1  925504  925504  3206.9 < 2.2e-16 ***
## data_acperfor$mat_s11     1 2122802 2122802  7355.5 < 2.2e-16 ***
## Residuals             12408 3580950     289                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n *******Summary mulit-model*******\n") 
## 
##  *******Summary mulit-model*******
summary(mmodel_1)
## 
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.225  -10.994    0.436   11.235   78.513 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           70.836419   0.909734   77.86   <2e-16 ***
## data_acperfor$fep_pro  0.127002   0.003937   32.26   <2e-16 ***
## data_acperfor$mat_s11  1.141128   0.013305   85.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.99 on 12408 degrees of freedom
## Multiple R-squared:  0.4598, Adjusted R-squared:  0.4597 
## F-statistic:  5281 on 2 and 12408 DF,  p-value: < 2.2e-16
# Standardised (beta) coefficients to compare predictor importance.
lm.beta::lm.beta(mmodel_1)
## 
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11)
## 
## Standardized Coefficients::
##           (Intercept) data_acperfor$fep_pro data_acperfor$mat_s11 
##             0.0000000             0.2204928             0.5862357
# Formatted regression table for reporting.
stargazer(mmodel_1, type="text")
## 
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                 g_sc            
## ------------------------------------------------
## fep_pro                       0.127***          
##                               (0.004)           
##                                                 
## mat_s11                       1.141***          
##                               (0.013)           
##                                                 
## Constant                     70.836***          
##                               (0.910)           
##                                                 
## ------------------------------------------------
## Observations                   12,411           
## R2                             0.460            
## Adjusted R2                    0.460            
## Residual Std. Error     16.988 (df = 12408)     
## F Statistic         5,281.195*** (df = 2; 12408)
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
# BUG FIX: `plotting.data` / `predicted.y` were referenced below but never
# defined anywhere in this file, so this chunk errored. Build the prediction
# grid here: mat_s11 across its observed range, at three representative
# fep_pro levels (quartiles). Because mmodel_1 was fitted with `$`-style
# terms, predict(newdata=) cannot be used, so predictions are computed
# directly from the coefficients.
plotting.data <- expand.grid(
  mat_s11 = seq(min(data_acperfor$mat_s11, na.rm = TRUE),
                max(data_acperfor$mat_s11, na.rm = TRUE), length.out = 100),
  fep_pro = round(quantile(data_acperfor$fep_pro, probs = c(0.25, 0.50, 0.75),
                           na.rm = TRUE))
)
cf <- coef(mmodel_1) # (Intercept), fep_pro slope, mat_s11 slope
plotting.data$predicted.y <- cf[1] + cf[2] * plotting.data$fep_pro + cf[3] * plotting.data$mat_s11
plotting.data$fep_pro <- factor(plotting.data$fep_pro) # discrete legend levels

# Scatter of observed data, fitted lines per fep_pro level, plus the simple
# lm smooth. (The original's first labs(x=, y=) call was fully overridden by
# the second labs(), so only the final labels are kept; "Mathmatics" typo in
# the labels corrected.)
plotg_sc <- ggplot(data_acperfor, aes(x = mat_s11, y = g_sc)) +
  theme_bw() +
  geom_point(colour = "grey") +
  geom_line(data = plotting.data,
            aes(x = mat_s11, y = predicted.y, colour = fep_pro), size = 1) +
  geom_smooth(method = "lm", colour = "red", se = F) +
  labs(title = "Model 1 Global Score \n as a function of Mathematics \n and the score of formulation of Engineering Projects",
       x = "Mathematics",
       y = "The average score of the professional evaluation",
       color = "fep_pro")
plotg_sc
## `geom_smooth()` using formula 'y ~ x'

# Dummy-code gender as a new numeric variable sex:
#   M -> 0 (reference category), F -> 1 (category of interest), anything else -> NA
# NOTE(review): the original comment had the labels reversed (it called 0 "females");
# the code below clearly codes males as 0.
data_acperfor$sex <- ifelse(data_acperfor$gender == "M", 0, ifelse(data_acperfor$gender == "F", 1, NA))
# Model 2: model 1 predictors plus the sex dummy
mmodel_2 <- lm(data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 +data_acperfor$sex)
cat("\n *******Summary multi-model*******\n")
## 
##  *******Summary multi-model*******
summary(mmodel_2)
## 
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 + 
##     data_acperfor$sex)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.526  -10.944    0.421   11.343   77.319 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           69.004330   0.942286  73.231  < 2e-16 ***
## data_acperfor$fep_pro  0.125806   0.003932  31.992  < 2e-16 ***
## data_acperfor$mat_s11  1.157898   0.013477  85.915  < 2e-16 ***
## data_acperfor$sex      2.282554   0.314493   7.258 4.17e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.95 on 12407 degrees of freedom
## Multiple R-squared:  0.4621, Adjusted R-squared:  0.462 
## F-statistic:  3553 on 3 and 12407 DF,  p-value: < 2.2e-16
# ANOVA table for model 2 (sequential sums of squares, term order as fitted)
anova(mmodel_2)
## Analysis of Variance Table
## 
## Response: data_acperfor$g_sc
##                          Df  Sum Sq Mean Sq  F value    Pr(>F)    
## data_acperfor$fep_pro     1  925504  925504 3220.231 < 2.2e-16 ***
## data_acperfor$mat_s11     1 2122802 2122802 7386.150 < 2.2e-16 ***
## data_acperfor$sex         1   15140   15140   52.677 4.166e-13 ***
## Residuals             12407 3565811     287                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Standardized (beta) coefficients for model 2
lm.beta::lm.beta(mmodel_2)
## 
## Call:
## lm(formula = data_acperfor$g_sc ~ data_acperfor$fep_pro + data_acperfor$mat_s11 + 
##     data_acperfor$sex)
## 
## Standardized Coefficients::
##           (Intercept) data_acperfor$fep_pro data_acperfor$mat_s11 
##            0.00000000            0.21841594            0.59485077 
##     data_acperfor$sex 
##            0.04850701
# Publication-style coefficient table for model 2
stargazer(mmodel_2, type="text")
## 
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                 g_sc            
## ------------------------------------------------
## fep_pro                       0.126***          
##                               (0.004)           
##                                                 
## mat_s11                       1.158***          
##                               (0.013)           
##                                                 
## sex                           2.283***          
##                               (0.314)           
##                                                 
## Constant                     69.004***          
##                               (0.942)           
##                                                 
## ------------------------------------------------
## Observations                   12,411           
## R2                             0.462            
## Adjusted R2                    0.462            
## Residual Std. Error     16.953 (df = 12407)     
## F Statistic         3,553.019*** (df = 3; 12407)
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
# Model 2 fit plot: observed g_sc vs mat_s11 with per-sex fitted lines from
# plotting.data2 and an overall red lm trend line.
# FIX: removed the dead first labs(x=, y=) call (overridden by the second labs());
# corrected the "Mathmatics" typo; spelled out FALSE.
plotg_sc2 <- ggplot(data_acperfor, aes(x = mat_s11, y = g_sc)) +
  theme_bw() +
  geom_point(colour = "grey") +
  geom_line(data = plotting.data2, aes(x = mat_s11, y = predicted.y, colour = sex), size = 1.25) +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  labs(title = "Model 2 Global Score \n as a function of Mathematics \n and the score of formulation of Engineering Projects by gender",
       x = "Mathematics",
       y = "The average score of the professional evaluation",
       color = "Gender")
plotg_sc2
## `geom_smooth()` using formula 'y ~ x'

# Side-by-side comparison table of models 1 and 2
stargazer(mmodel_1, mmodel_2, type="text")
## 
## =============================================================================
##                                        Dependent variable:                   
##                     ---------------------------------------------------------
##                                               g_sc                           
##                                 (1)                          (2)             
## -----------------------------------------------------------------------------
## fep_pro                       0.127***                     0.126***          
##                               (0.004)                      (0.004)           
##                                                                              
## mat_s11                       1.141***                     1.158***          
##                               (0.013)                      (0.013)           
##                                                                              
## sex                                                        2.283***          
##                                                            (0.314)           
##                                                                              
## Constant                     70.836***                    69.004***          
##                               (0.910)                      (0.942)           
##                                                                              
## -----------------------------------------------------------------------------
## Observations                   12,411                       12,411           
## R2                             0.460                        0.462            
## Adjusted R2                    0.460                        0.462            
## Residual Std. Error     16.988 (df = 12408)          16.953 (df = 12407)     
## F Statistic         5,281.195*** (df = 2; 12408) 3,553.019*** (df = 3; 12407)
## =============================================================================
## Note:                                             *p<0.1; **p<0.05; ***p<0.01
# Investigate the g_sc ~ mat_s11 relationship while controlling for fep_pro,
# using a complete-cases subset of the model variables.
varsmodel <- c("g_sc", "sex", "fep_pro", "mat_s11")
omitdata <- na.omit(data_acperfor[varsmodel])
# NOTE(review): ppcor::spcor.test computes the SEMI-partial (part) correlation,
# while the report text below describes a partial correlation; ppcor::pcor.test
# would give the partial correlation — confirm which was intended.
ppcor::spcor.test(omitdata$g_sc, omitdata$mat_s11, omitdata$fep_pro) 
##    estimate p.value statistic     n gp  Method
## 1 0.5658774       0  76.45188 12411  1 pearson
# Zero-order (unadjusted) Pearson correlations for comparison
cor(omitdata$g_sc, omitdata$fep_pro)
## [1] 0.373643
cor(omitdata$g_sc, omitdata$mat_s11)
## [1] 0.6438379
cor(omitdata$mat_s11, omitdata$fep_pro)
## [1] 0.2612434

Reporting Results of partial correlation

Partial correlation was used to explore the relationship between the global score (as measured by g_sc) and mathematics while controlling for scores on formulation of engineering projects (fep_pro). Preliminary analyses were performed to ensure no violation of the assumptions of normality, linearity and homoscedasticity. There was a strong, positive partial correlation between the global scores and mathematics while controlling for the scores on formulation of engineering projects (r = 0.57, n = 12,411, p < 0.001). An inspection of the zero-order correlation (r = 0.26) suggested that the scores on formulation of engineering projects had very little effect on the strength of the relationship between these two variables.
# Add the fep_pro x mat_s11 interaction term (kept on data_acperfor for
# downstream code that inspects the full dataset).
data_acperfor$interaction <- data_acperfor$fep_pro * data_acperfor$mat_s11
# BUG FIX: the original model mixed omitdata$ columns with
# data_acperfor$interaction — if na.omit() had dropped any rows the vectors
# would be misaligned. Build the interaction on omitdata so all model terms
# come from the same (row-aligned) data frame.
omitdata$interaction <- omitdata$fep_pro * omitdata$mat_s11
mmodel_3 <- lm(omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 + omitdata$interaction)
# Display regression summary for model 3
summary(mmodel_3)
## 
## Call:
## lm(formula = omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 + 
##     data_acperfor$interaction)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -146.748  -10.995    0.406   11.360   78.377 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               8.686e+01  9.147e-01  94.962  < 2e-16 ***
## omitdata$sex              2.332e+00  3.159e-01   7.382 1.66e-13 ***
## omitdata$mat_s11          9.064e-01  1.787e-02  50.729  < 2e-16 ***
## data_acperfor$interaction 1.751e-03  5.843e-05  29.975  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.03 on 12407 degrees of freedom
## Multiple R-squared:  0.4571, Adjusted R-squared:  0.4569 
## F-statistic:  3481 on 3 and 12407 DF,  p-value: < 2.2e-16
# ANOVA table for model 3 (sequential sums of squares, term order as fitted)
anova(mmodel_3)
## Analysis of Variance Table
## 
## Response: omitdata$g_sc
##                              Df  Sum Sq Mean Sq F value    Pr(>F)    
## omitdata$sex                  1   17426   17426   60.07 9.867e-15 ***
## omitdata$mat_s11              1 2751869 2751869 9485.83 < 2.2e-16 ***
## data_acperfor$interaction     1  260653  260653  898.49 < 2.2e-16 ***
## Residuals                 12407 3599308     290                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Standardized (beta) coefficients for model 3
lm.beta::lm.beta(mmodel_3)
## 
## Call:
## lm(formula = omitdata$g_sc ~ omitdata$sex + omitdata$mat_s11 + 
##     data_acperfor$interaction)
## 
## Standardized Coefficients::
##               (Intercept)              omitdata$sex          omitdata$mat_s11 
##                0.00000000                0.04956147                0.46562876 
## data_acperfor$interaction 
##                0.27229709
# Publication-style coefficient table for model 3
stargazer(mmodel_3, type="text")
## 
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                 g_sc            
## ------------------------------------------------
## sex                           2.332***          
##                               (0.316)           
##                                                 
## mat_s11                       0.906***          
##                               (0.018)           
##                                                 
## interaction                   0.002***          
##                               (0.0001)          
##                                                 
## Constant                     86.858***          
##                               (0.915)           
##                                                 
## ------------------------------------------------
## Observations                   12,411           
## R2                             0.457            
## Adjusted R2                    0.457            
## Residual Std. Error     17.032 (df = 12407)     
## F Statistic         3,481.463*** (df = 3; 12407)
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
# Model 3 fit plot: observed g_sc vs mat_s11 with per-sex fitted lines from
# plotting.data3 and an overall red lm trend line.
# FIX: removed the dead first labs(x=, y=) call (overridden by the second labs());
# corrected the "Mathmatics" typo; spelled out FALSE.
plotg_sc3 <- ggplot(data_acperfor, aes(x = mat_s11, y = g_sc)) +
  theme_bw() +
  geom_point(colour = "grey") +
  geom_line(data = plotting.data3, aes(x = mat_s11, y = predicted.y, colour = sex), size = 1) +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  labs(title = "Model 3 Global Score \n as a function of Mathematics \n and the score of formulation of engineering projects \n with interaction by gender",
       x = "Mathematics",
       y = "The average score of the professional evaluation",
       color = "Gender")
plotg_sc3
## `geom_smooth()` using formula 'y ~ x'

# Side-by-side comparison table of all three models
stargazer(mmodel_1, mmodel_2, mmodel_3, type="text")
## 
## ==========================================================================================================
##                                                      Dependent variable:                                  
##                     --------------------------------------------------------------------------------------
##                                               g_sc                                        g_sc            
##                                 (1)                          (2)                          (3)             
## ----------------------------------------------------------------------------------------------------------
## fep_pro                       0.127***                     0.126***                                       
##                               (0.004)                      (0.004)                                        
##                                                                                                           
## mat_s11                       1.141***                     1.158***                                       
##                               (0.013)                      (0.013)                                        
##                                                                                                           
## sex                                                        2.283***                                       
##                                                            (0.314)                                        
##                                                                                                           
## sex                                                                                     2.332***          
##                                                                                         (0.316)           
##                                                                                                           
## mat_s11                                                                                 0.906***          
##                                                                                         (0.018)           
##                                                                                                           
## interaction                                                                             0.002***          
##                                                                                         (0.0001)          
##                                                                                                           
## Constant                     70.836***                    69.004***                    86.858***          
##                               (0.910)                      (0.942)                      (0.915)           
##                                                                                                           
## ----------------------------------------------------------------------------------------------------------
## Observations                   12,411                       12,411                       12,411           
## R2                             0.460                        0.462                        0.457            
## Adjusted R2                    0.460                        0.462                        0.457            
## Residual Std. Error     16.988 (df = 12408)          16.953 (df = 12407)          17.032 (df = 12407)     
## F Statistic         5,281.195*** (df = 2; 12408) 3,553.019*** (df = 3; 12407) 3,481.463*** (df = 3; 12407)
## ==========================================================================================================
## Note:                                                                          *p<0.1; **p<0.05; ***p<0.01

Linear Regression-Assumptions

Model 1 g_sc ~ fep_pro + mat_s11

# Standardized residuals for model 1; values outside [-3.29, 3.29] suggest outliers.
# FIX: use <- for assignment and correct the "mmolde" typo (name is local to
# this chunk, so the rename is safe).
stdres_mmodel1 <- rstandard(mmodel_1)
summary(stdres_mmodel1)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -8.667304 -0.647218  0.025666  0.000007  0.661424  4.622050
pastecs::stat.desc(stdres_mmodel1, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 2.566595e-02 7.426034e-06 8.976692e-03 1.759571e-02 1.000091e+00 1.000045e+00 
##     coef.var 
## 1.346675e+05
  1. If all standardized residuals fall within [-3.29, 3.29] there are no outliers; here the observed range is [-8.67, 4.62], so outliers are present.
#Influential observations
# An observation that changes the slope of the line. 
# They have a large influence on the fit of the model. 

#influential outliers - Cook's distance
cooksd_mod1 <- sort(cooks.distance(mmodel_1))

#plot Cook's distance; flag values above the conventional 4*mean(D) cutoff
plot(cooksd_mod1, pch="*", cex=2, main="Influential observations by Cooks distance for Model 1")
abline(h=4*mean(cooksd_mod1, na.rm=T), col="red") #cutoff line
# BUG FIX: the label condition used the misspelled argument na.rom=, which
# mean() silently absorbed via its ... argument (so NA handling was never applied);
# corrected to na.rm=.
text(x=1:length(cooksd_mod1)+1, y=cooksd_mod1, labels=ifelse(cooksd_mod1>4*mean(cooksd_mod1, na.rm=T), names(cooksd_mod1), ""), col="red")

#find rows related to influential observations
influential_model1 <- as.numeric(names(cooksd_mod1)[(cooksd_mod1 > 4*mean(cooksd_mod1, na.rm=T))])  # influential row numbers
stem(influential_model1)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 00011111222233444555666777888899
##    1 | 0011111111122233333334444555556677788899999
##    2 | 00000000011111111111111122222223333333444455555555666666677777778888
##    3 | 0011111222333344444455555566667777778888888899999
##    4 | 00011112223333333444455556667777888899999
##    5 | 0000011112222223333333333333344445555666666666777788888899999
##    6 | 00000011111111222222222333344444444455555555566677777777888888888999
##    7 | 000000011111222223344444444555666677777778888888999999
##    8 | 000001112222233333344444445555555677778888899
##    9 | 000000011122222222333333333344444445556666666667777778888999
##   10 | 000000011111333344445555666666667778899999
##   11 | 00000111122222222223333344444555555666677788888899999
##   12 | 000111111111122222333333333444444
head(data_acperfor[influential_model1, ])  # preview the influential observations for model 1
## # A tibble: 6 x 46
##   cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… F      Complete … Complete … Auxiliary… Auxiliary… Stratu… It is…
## 2 SB1120… F      Complete … Complete … 0          Technical… Stratu… It is…
## 3 SB1120… M      Complete … Incomplet… Independe… Home       Stratu… Level…
## 4 SB1120… M      Complete … Complete … Other occ… Auxiliary… Stratu… Level…
## 5 SB1120… M      Complete … Complete … Technical… Home       Stratu… It is…
## 6 SB1120… F      Incomplet… Incomplet… Operator   Home       Stratu… It is…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## #   computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## #   fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## #   school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## #   cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## #   university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## #   cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## #   percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## #   sel_ihe <dbl>, sex <dbl>, interaction <dbl>
head(data_acperfor[influential_model1, ]$fep_pro)  # look at the values of fep_pro
## [1] 211 221  79 210 145  93
head(data_acperfor[influential_model1, ]$mat_s11)  # look at the values of mat_s11
## [1]  82  94  65  69 100  70
# Bonferroni-adjusted test for the most extreme studentized residuals of model 1
car::outlierTest(mmodel_1)
##       rstudent unadjusted p-value Bonferroni p
## 1336 -8.693310         3.9597e-18   4.9144e-14
## 7721 -5.687573         1.3177e-08   1.6354e-04
## 6920  4.625847         3.7684e-06   4.6770e-02
# Bonferonni p-value for most extreme obs 
# Are there any cases where the outcome variable has an unusual variable for its predictor values?

Report outliers for model 1: An analysis of standardized residuals was carried out on the data to identify any outliers, which indicated three cases for concern that were deleted.

# Leverage points: observations with an x value far from the mean of x
car::leveragePlots(mmodel_1)

# Assess homoscedasticity: standardized residuals (ZRESID) vs predicted values (ZPRED)
plot(mmodel_1,1)

# From the homoscedasticity plot: assumptions met; scale-location plot as a second check
plot(mmodel_1,3)

# Normality of residuals
# Density plot of the raw residuals
plot(density(resid(mmodel_1)))

# Create QQ plot for the standardized residuals
car::qqPlot(mmodel_1, main="Normal QQ plot of Regression Standardized Residual for mmodel_1")

## [1] 1336 7721

Report for Random Normal Distribution of errors for model 1:

The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.

# Non-zero variances check for the model 1 predictors
#pastecs::stat.desc(mmodel_3, basic=F)
varsmodel <- c("fep_pro", "mat_s11")
model_predictor <- data_acperfor[varsmodel]
pastecs::stat.desc(model_predictor, basic=F)
##                   fep_pro     mat_s11
## median        153.0000000  64.0000000
## mean          145.4765933  64.3207638
## SE.mean         0.3601859   0.1065813
## CI.mean.0.95    0.7060202   0.2089158
## var          1610.1268292 140.9835648
## std.dev        40.1263857  11.8736500
## coef.var        0.2758271   0.1846006

Non-Zero Report for model 1: The data also met the assumption of non-zero variances (Formulation of Engineering Projects = 1610.13; Mathematics = 140.98)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Collinearity Note: Occurs when two or more independent variables contain strongly redundant information. If variables are collinear then it means there is not enough distinct information in these variables for MLR to operate – they are essentially measuring the same thing. if we conduct MLR with collinear variables then the model will produce invalid results Need to check for collinearity by examining a correlation matrix that compares your independent variables with each other.

# Collinearity: pairwise correlations above 0.8 suggest it might be present.
# VIF > 2.5 or tolerance < 0.4 may indicate multicollinearity.
vif_mm1 <- car::vif(mmodel_1)
vif_mm1
## data_acperfor$fep_pro data_acperfor$mat_s11 
##              1.073247              1.073247
# Calculate tolerance (reciprocal of VIF)
1/vif_mm1
## data_acperfor$fep_pro data_acperfor$mat_s11 
##             0.9317519             0.9317519

The results show no multicollinearity: VIF = 1.07 < 2.5 and tolerance = 0.93 > 0.4.

Report for collinearity for model 1: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Formulation of Engineering Projects, Tolerance=0.93, VIF=1.07; Mathematics, Tolerance=0.93, VIF=1.07). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Model 2 g_sc ~ fep_pro + mat_s11 group by sex

(dummy code, F-1, M-0)

# Standardized residuals for model 2; values outside [-3.29, 3.29] suggest outliers.
# FIX: use <- for assignment and correct the "mmolde" typo (name is local to
# this chunk, so the rename is safe).
stdres_mmodel2 <- rstandard(mmodel_2)
summary(stdres_mmodel2)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -8.644256 -0.645643  0.024830  0.000008  0.669164  4.561478
pastecs::stat.desc(stdres_mmodel2, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 2.482968e-02 7.639327e-06 8.976679e-03 1.759568e-02 1.000088e+00 1.000044e+00 
##     coef.var 
## 1.309073e+05

If all standardized residuals fall within [-3.29, 3.29] there are no outliers; here the observed range is [-8.64, 4.56], so outliers are present.

#Influential observations
# An observation that changes the slope of the line. 
# They have a large influence on the fit of the model. 

#influential outliers - Cook's distance
cooksd_mod2 <- sort(cooks.distance(mmodel_2))

#plot Cook's distance; flag values above the conventional 4*mean(D) cutoff
plot(cooksd_mod2, pch="*", cex=2, main="Influential observations by Cooks distance for Model 2")
abline(h=4*mean(cooksd_mod2, na.rm=T), col="red") #cutoff line
# BUG FIX: na.rom= was a typo for na.rm=; mean() silently absorbed the
# misspelled argument via ... so NA handling was never applied.
text(x=1:length(cooksd_mod2)+1, y=cooksd_mod2, labels=ifelse(cooksd_mod2>4*mean(cooksd_mod2, na.rm=T), names(cooksd_mod2), ""), col="red")

#find rows related to influential observations
influential_model2 <- as.numeric(names(cooksd_mod2)[(cooksd_mod2 > 4*mean(cooksd_mod2, na.rm=T))])  # influential row numbers
stem(influential_model2)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 00111112222333444555666677788888899
##    1 | 001111111112223333333344444555556677788899999
##    2 | 00000000000011111111111111122222233333333444455555566666667777778888
##    3 | 000111112233333444444455556666777777788888888999
##    4 | 0001111222333333444455556667777888899999
##    5 | 00001111222223333333333333444455555666666667777788888889999
##    6 | 0000001111111222222222333344444455555556677777777788888888999999
##    7 | 0000001111122222334444444445566667777778888899999
##    8 | 00001112222333334444444555555556777888889999
##    9 | 000000011112222222233333333344444555666666666677777778888999
##   10 | 00000001111333334444455555666666666777788899999
##   11 | 000001112222222223333344444555555566667788888999
##   12 | 0001111111112222222333333334444444
head(data_acperfor[influential_model2, ])  # preview the influential observations for model 2
## # A tibble: 6 x 46
##   cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… F      Incomplet… Incomplet… 0          0          Stratu… Esta …
## 2 SB1120… M      Postgradu… Postgradu… Independe… Technical… Stratu… It is…
## 3 SB1120… M      Complete … Complete … Executive  Home       Stratu… It is…
## 4 SB1120… F      Complete … Complete … Operator   Independe… Stratu… Level…
## 5 SB1120… M      Complete … Complete … Technical… Home       Stratu… It is…
## 6 SB1120… M      Complete … Complete … Technical… Home       Stratu… It is…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## #   computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## #   fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## #   school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## #   cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## #   university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## #   cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## #   percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## #   sel_ihe <dbl>, sex <dbl>, interaction <dbl>
head(data_acperfor[influential_model2, ]$fep_pro)  # look at the values of fep_pro
## [1] 147 210 206  74  98  92
head(data_acperfor[influential_model2, ]$mat_s11)  # look at the values of mat_s11
## [1]  52 100  68  76  61  53
head(data_acperfor[influential_model2, ]$sex)  # look at the dummy-coded sex values (M=0, F=1)
## [1] 1 0 0 1 0 0
# Bonferroni-adjusted test for the most extreme studentized residuals of model 2
car::outlierTest(mmodel_2)
##       rstudent unadjusted p-value Bonferroni p
## 1336 -8.670056         4.8523e-18   6.0222e-04
## 7721 -5.652291         1.6181e-08   2.0083e-04
# Bonferroni p-value for most extreme obs
# Are there any cases where the outcome variable has an unusual value for its predictor values?

Report outliers for model 2: An analysis of standardized residuals was carried out on the data to identify any outliers, which indicated two cases for concern that were deleted.

# Leverage points: observations with an x value far from the mean of x
car::leveragePlots(mmodel_2)

# Assess homoscedasticity: ZRESID (y) vs ZPRED (x)
# ZPRED (the standardized predicted values of the dependent variable based on the model). These values are
# standardized forms of the values predicted by the model.

# ZRESID (the standardized residuals, or errors). These values are the standardized differences between the
# observed data and the values that the model predicts).

plot(mmodel_2,1)

# From the homoscedasticity plot: assumptions met; scale-location plot as a second check
plot(mmodel_2,3)

# Normality of residuals
# Density plot of the raw residuals
plot(density(resid(mmodel_2)))

# Create QQ plot for the standardized residuals
car::qqPlot(mmodel_2, main="Normal QQ plot \n of Regression Standardized Residual for mmodel_2")

## [1] 1336 7721

Report for Random Normal Distribution of errors for model 2: The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.

# Non-zero variances check for the model 2 predictors (note: varsmodel is reused here)
varsmodel <- c("fep_pro", "mat_s11","sex")
model_predictor <- data_acperfor[varsmodel]
pastecs::stat.desc(model_predictor, basic=F)
##                   fep_pro     mat_s11         sex
## median        153.0000000  64.0000000 0.000000000
## mean          145.4765933  64.3207638 0.406333092
## SE.mean         0.3601859   0.1065813 0.004408863
## CI.mean.0.95    0.7060202   0.2089158 0.008642056
## var          1610.1268292 140.9835648 0.241245948
## std.dev        40.1263857  11.8736500 0.491167943
## coef.var        0.2758271   0.1846006 1.208781547

Non-Zero Report for model 2: The data also met the assumption of non-zero variances (Formulation of Engineering Projects = 1610.13; Mathematics = 140.98, sex=0.24)

# Collinearity: pairwise correlations above 0.8 suggest it might be present.
# VIF > 2.5 or tolerance < 0.4 may indicate multicollinearity.
vif_mm2 <- car::vif(mmodel_2)
vif_mm2
## data_acperfor$fep_pro data_acperfor$mat_s11     data_acperfor$sex 
##              1.075136              1.105746              1.030295
# Calculate tolerance (reciprocal of VIF)
1/vif_mm2
## data_acperfor$fep_pro data_acperfor$mat_s11     data_acperfor$sex 
##             0.9301151             0.9043665             0.9705962

The results show no multicollinearity: VIF values 1.08, 1.11, 1.03 are all < 2.5 and tolerances 0.93, 0.90, 0.97 are all > 0.4.

Report for collinearity of model 2: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Formulation of Engineering Projects, Tolerance=0.93, VIF=1.08; Mathematics, Tolerance=0.90, VIF=1.11; Sex, Tolerance=0.97, VIF=1.03). +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Model 3: g_sc ~ sex + mat_s11 + interaction (sex dummy-coded F = 1, M = 0), where the interaction term = fep_pro × mat_s11.

# Standardized residuals for model 3; values outside [-3.29, 3.29] suggest outliers.
# FIX: use <- for assignment and correct the "mmolde" typo (name is local to
# this chunk, so the rename is safe).
stdres_mmodel3 <- rstandard(mmodel_3)
summary(stdres_mmodel3)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -8.616957 -0.645600  0.023838  0.000005  0.667170  4.602219
pastecs::stat.desc(stdres_mmodel3, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 2.383789e-02 5.266550e-06 8.976672e-03 1.759567e-02 1.000086e+00 1.000043e+00 
##     coef.var 
## 1.898858e+05

If all standardized residuals fall within [-3.29, 3.29] there are no outliers; here the observed range is [-8.62, 4.60], so outliers are present.

#Influential observations
# An observation that changes the slope of the line. 
# They have a large influence on the fit of the model. 

#influential outliers - Cook's distance
cooksd_mod3 <- sort(cooks.distance(mmodel_3))

#plot Cook's distance; flag values above the conventional 4*mean(D) cutoff
# BUG FIX: the title previously said "Model 2" (copy-paste error) and
# misspelled "Influnential"; this chunk analyses model 3.
plot(cooksd_mod3, pch="*", cex=2, main="Influential observations by Cooks distance for Model 3")
abline(h=4*mean(cooksd_mod3, na.rm=T), col="red") #cutoff line
# BUG FIX: na.rom= was a typo for na.rm=; mean() silently absorbed the
# misspelled argument via ... so NA handling was never applied.
text(x=1:length(cooksd_mod3)+1, y=cooksd_mod3, labels=ifelse(cooksd_mod3>4*mean(cooksd_mod3, na.rm=T), names(cooksd_mod3), ""), col="red")

#find rows related to influential observations
influential_model3 <- as.numeric(names(cooksd_mod3)[(cooksd_mod3 > 4*mean(cooksd_mod3, na.rm=T))])  # influential row numbers
stem(influential_model3)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 001111122233344455566667788888899
##    1 | 0011111111122233333334444455555667788899999
##    2 | 00000000001111111111111222223333333344444555555666666777778888888899
##    3 | 00111122333334444455555667777777888888889999
##    4 | 0000111122233333444455566677777888899999
##    5 | 00000111122222233333333333334444555556666666677778888888899999
##    6 | 000000111111222222222333444445555555667777777777888888888999999
##    7 | 0000000111112222233444444455666677777778889999
##    8 | 000011122222333344444455555556778888889999
##    9 | 000000011122222222333333334444455566666666677777778888999
##   10 | 0000000111113334444555566666666677778889999
##   11 | 000011112222222222333344444555555566667788888899999
##   12 | 0000111111111122222233333334444444
#Inspect the first few influential rows across all variables
head(data_acperfor[influential_model3, ])  # influential observations.
## # A tibble: 6 x 46
##   cod_s11 gender edu_father edu_mother occ_father occ_mother stratum sisben
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… M      Complete … Complete … Operator   Operator   Stratu… It is…
## 2 SB1120… M      Complete … Complete … Operator   Operator   Stratu… Level…
## 3 SB1120… M      Incomplet… Complete … Independe… Operator   Stratu… Level…
## 4 SB1120… M      Incomplet… Complete … Independe… Operator   Stratu… It is…
## 5 SB1120… M      Complete … Complete … Technical… Home       Stratu… It is…
## 6 SB1120… F      Incomplet… Complete … Operator   Operator   Stratu… Level…
## # … with 38 more variables: people_house <chr>, internet <chr>, tv <chr>,
## #   computer <chr>, washing_mch <chr>, mic_oven <chr>, car <chr>, dvd <chr>,
## #   fresh <chr>, phone <chr>, mobile <chr>, revenue <chr>, job <chr>,
## #   school_name <chr>, school_nat <chr>, school_type <chr>, mat_s11 <dbl>,
## #   cr_s11 <dbl>, cc_s11 <dbl>, bio_s11 <dbl>, eng_s11 <dbl>, cod_spro <chr>,
## #   university <chr>, academic_program <chr>, qr_pro <dbl>, cr_pro <dbl>,
## #   cc_pro <dbl>, eng_pro <dbl>, wc_pro <dbl>, fep_pro <dbl>, g_sc <dbl>,
## #   percentile <dbl>, `2nd_decile` <dbl>, quartile <dbl>, sel <dbl>,
## #   sel_ihe <dbl>, sex <dbl>, interaction <dbl>
#Inspect the Model 3 predictor values of those influential rows
head(data_acperfor[influential_model3, ]$mat_s11)
## [1] 66 47 61 65 55 54
head(data_acperfor[influential_model3, ]$sex)
## [1] 0 0 0 0 0 1
head(data_acperfor[influential_model3, ]$interaction)
## [1]  9174  4230  3355 12935  5170  7506
#Bonferroni outlier test: flags observations with unusually large studentized residuals
car::outlierTest(mmodel_3)
##       rstudent unadjusted p-value Bonferroni p
## 1336 -8.642510         6.1691e-18   7.6565e-14
## 7721 -5.614364         2.0152e-08   2.5011e-04
# Bonferroni p-value for the most extreme observations.
# Are there any cases where the outcome variable has an unusual value for its predictor values?

Report outliers for model 3: An analysis of standardized residuals was carried out on the data to identify any outliers, which indicated two cases for concern, which were deleted.

#Leverage points: observations whose x value lies far from the mean of x
car::leveragePlots(mmodel_3)

#Assess homoscedasticity by plotting ZRESID (y) against ZPRED (x).
# ZPRED: the model's predicted values of the dependent variable, in
# standardized form.
# ZRESID: the standardized differences between the observed data and the
# values the model predicts.

plot(mmodel_3, which = 1)

#From the homoscedasticity plot: assumptions met
plot(mmodel_3, which = 3)

#Normality of residuals:
# density plot of the raw residuals
plot(density(residuals(mmodel_3)))

#QQ plot of the standardized residuals
car::qqPlot(mmodel_3, main="Normal QQ plot of Regression Standardized Residual for mmodel_3")

## [1] 1336 7721

Report for Random Normal Distribution of errors for model 3: The histogram of standardized residuals indicated that the data contained normally distributed errors, as did the normal Q-Q plot of standardized residuals, which showed most points were extremely close to the line.

# Non-zero variances: regression requires each predictor to have variance > 0
varsmodel <- c("interaction", "mat_s11","sex")  # Model 3 predictor columns
model_predictor <- data_acperfor[varsmodel]
# Descriptive statistics; the 'var' row gives each predictor's variance
pastecs::stat.desc(model_predictor, basic=F)
##               interaction     mat_s11         sex
## median       9.238000e+03  64.0000000 0.000000000
## mean         9.481624e+03  64.3207638 0.406333092
## SE.mean      3.225355e+01   0.1065813 0.004408863
## CI.mean.0.95 6.322196e+01   0.2089158 0.008642056
## var          1.291105e+07 140.9835648 0.241245948
## std.dev      3.593196e+03  11.8736500 0.491167943
## coef.var     3.789641e-01   0.1846006 1.208781547

Non-zero variance report for model 3: The data also met the assumption of non-zero variances (Interaction = 1.29e+07; Mathematics = 140.98; Sex = 0.24).

#Collinearity: pairwise correlations above 0.8 might indicate collinearity;
# VIF > 2.5 or tolerance < 0.4 might indicate multicollinearity.
vif_mm3 <- car::vif(mmodel_3)
vif_mm3
##              omitdata$sex          omitdata$mat_s11 data_acperfor$interaction 
##                  1.030075                  1.925229                  1.885766
#Calculate tolerance (reciprocal of VIF)
1/vif_mm3
##              omitdata$sex          omitdata$mat_s11 data_acperfor$interaction 
##                 0.9708033                 0.5194188                 0.5302886

The results show no multicollinearity: the VIF values (1.03, 1.93, 1.89) are all below 2.5, and the tolerance values (0.97, 0.52, 0.53) are all above 0.4.

Report for collinearity of model 3: Tests to see if the data met the assumption of collinearity indicated that multicollinearity was not a concern (Interaction, Tolerance=0.53, VIF=1.89; Mathematics, Tolerance=0.52, VIF=1.93; Sex, Tolerance=0.97, VIF=1.03).